import csv
import re
import zipfile
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class IntegratedFile:
    """Immutable record describing one integrated-file zip archive.

    Attributes:
        round: Round identifier, kept as a string.
        edition: Edition identifier, kept as a string (e.g. ``"6.7"``).
        zip_path: Location of the zip archive on disk.
    """

    # All identifiers stay as text; no numeric conversion happens here.
    round: str
    edition: str
    zip_path: Path


# Captures the value of the id attribute of every `<h3 id="...">` opening tag.
# NOTE(review): presumably the codebook HTML emits one such heading per
# variable -- confirm against an actual codebook file.
VAR_ID_RE = re.compile(r'<h3 id="([^"]+)"')


def iter_integrated_zips(ess_data_dir: Path) -> list[IntegratedFile]:
    """List the integrated-file archives found under *ess_data_dir*.

    Looks in ``ess_data_dir / "integrated_files"`` for zip files whose stem
    follows the pattern ``ESS{round}_integrated_e{edition}``. Files matching
    the glob but not the full pattern are skipped.

    Args:
        ess_data_dir: Directory that contains the ``integrated_files`` folder.

    Returns:
        One ``IntegratedFile`` per matching archive, ordered by numeric round,
        then by numeric edition components, then by file name. Sorting
        numerically keeps round 10 after round 9; a plain path sort would put
        ``ESS10`` before ``ESS1_`` and ``ESS2``.
    """
    integrated_dir = ess_data_dir / "integrated_files"
    # Name pattern: ESS{round}_integrated_e{edition}.zip
    name_re = re.compile(r"^ESS(?P<round>\d+)_integrated_e(?P<edition>[\d.]+)$")

    def edition_key(edition: str) -> tuple[int, ...]:
        # "6.7" -> (6, 7). Empty parts (e.g. from a trailing dot) are dropped
        # so that int() never sees an empty string.
        return tuple(int(part) for part in edition.split(".") if part)

    items: list[IntegratedFile] = []
    for zip_path in integrated_dir.glob("ESS*_integrated_*.zip"):
        m = name_re.match(zip_path.stem)
        if m is None:
            continue
        items.append(
            IntegratedFile(
                round=m.group("round"),
                edition=m.group("edition"),
                zip_path=zip_path,
            )
        )

    # The file name is the final tie-breaker so the order is fully deterministic.
    items.sort(key=lambda f: (int(f.round), edition_key(f.edition), f.zip_path.name))
    return items


def find_codebook_name(zf: zipfile.ZipFile) -> str | None:
    # Typical: "ESS1_integrated_e6.7 codebook.html"
    for n in zf.namelist():
        if n.lower().endswith("codebook.html"):
            return n
    return None


def extract_variable_ids_from_codebook_html(html: str) -> set[str]:
    """Collect the distinct ids that ``VAR_ID_RE`` captures in *html*.

    Each captured value has surrounding whitespace stripped before it is
    added to the result set.
    """
    # The pattern has a single capture group, so findall yields the captured
    # id strings directly.
    captured_ids = VAR_ID_RE.findall(html)
    return {raw_id.strip() for raw_id in captured_ids}


def read_codebook_vars(zip_path: Path) -> tuple[str | None, set[str]]:
    """Read the codebook inside *zip_path* and return its variable ids.

    Returns:
        ``(codebook_member_name, variable_ids)``. When the archive holds no
        codebook member the result is ``(None, set())``.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        member = find_codebook_name(archive)
        if member:
            raw_bytes = archive.read(member)
            # Undecodable bytes are dropped rather than raising.
            text = raw_bytes.decode("utf-8", errors="ignore")
            return member, extract_variable_ids_from_codebook_html(text)
    return None, set()


def _write_csv(path: Path, fieldnames: list[str], rows: list[dict]) -> None:
    """Write *rows* to *path* as a UTF-8 CSV file with a header row."""
    with path.open("w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main() -> None:
    """Build the archive index and the variable-presence matrix.

    The project root is taken to be the parent of this file's directory.
    Archives are discovered under ``<root>/ess_data`` via
    ``iter_integrated_zips``; two CSV files are written to ``<root>/outputs``
    (created if missing):

    * ``ess_integrated_files_index.csv`` - one row per archive with its round,
      edition, path relative to the root, codebook member name (empty when
      none was found) and the number of variable ids extracted.
    * ``ess_integrated_variable_presence.csv`` - one row per archive with a
      1/0 flag for each variable of interest.

    Both files always get their full header, even when no archive is found.
    """
    root = Path(__file__).resolve().parents[1]
    ess_data_dir = root / "ess_data"
    out_dir = root / "outputs"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Variables of interest for the causal plan (B: inequality) + identification.
    vars_of_interest = [
        # IDs / geography / weights
        "cntry",
        "regunit",
        "region",
        "dweight",
        "pspwght",
        "pweight",
        "anweight",
        # Internet use (endogenous)
        "netuse",
        "netusoft",
        "netustm",
        # News/media (controls/mechanisms)
        "nwspol",
        "tvtot",
        "tvpol",
        "rdtot",
        "rdpol",
        "nwsptot",
        "nwsppol",
        # Participation outcomes
        "vote",
        "contplt",
        "badge",
        "sgnptit",
        "pbldmn",
        "pbldmna",
        "bctprd",
        "wrkprty",
        "wrkorg",
        "pstplonl",
        # Inequality dimensions / controls
        "eisced",
        "eduyrs",
        "agea",
        "gndr",
        "hinctnta",
        "brncntr",
        "domicil",
        "uempla",
    ]

    # Declared up front so the header does not depend on whether any archive
    # was found.
    index_fields = ["round", "edition", "zip_path", "codebook_name", "variables_found"]
    presence_fields = ["round", "edition", *vars_of_interest]

    rows = []
    index_rows = []
    for f in iter_integrated_zips(ess_data_dir):
        codebook_name, vars_in_round = read_codebook_vars(f.zip_path)
        rows.append(
            {
                "round": f.round,
                "edition": f.edition,
                "zip_path": str(f.zip_path.relative_to(root)),
                "codebook_name": codebook_name or "",
                "variables_found": len(vars_in_round),
            }
        )
        # Iterate the list (not a set) so the columns keep the declared order.
        present = {v: (1 if v in vars_in_round else 0) for v in vars_of_interest}
        index_rows.append(
            {
                "round": f.round,
                "edition": f.edition,
                **present,
            }
        )

    out_files = out_dir / "ess_integrated_files_index.csv"
    _write_csv(out_files, index_fields, rows)

    out_presence = out_dir / "ess_integrated_variable_presence.csv"
    _write_csv(out_presence, presence_fields, index_rows)

    print(f"Wrote: {out_files}")
    print(f"Wrote: {out_presence}")


# Run the index build only when executed as a script, not on import.
if __name__ == "__main__":
    main()

